library(data.table)
library(tidyr)

# ---- Wave 5: load the raw data ----

# Read the serialized Wave 5 object from disk.
WV5_data <- readRDS(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/F00007944-WV5_Data_R_v20180912.rds"
)

# Keep a plain data.frame version of the loaded object for the cleaning steps.
WV5_data_df <- as.data.frame(WV5_data)

# Quick preview: the first rows of the first five columns.
head(WV5_data_df[, seq_len(5)], n = 6L)

# clean the data set

library(dplyr)

# Give the Wave 5 survey items readable names (new_name = original column).
# rename() keeps every column in place, so argument order does not matter.
WV5_data <- rename(
  WV5_data_df,
  wave = V1,
  country = V2,
  risk_and_adventure = V86,
  sex = V235,
  age = V237
)
WV5_data


# Keep only the variables of interest, in this column order.
WV5_data <- select(WV5_data, risk_and_adventure, sex, age, country, wave)
WV5_data
# Exclusion of participants with no info about risk taking, sex, or age:
# only rows where all three values are greater than 0 are kept.
# The result is stored back into WV5_data (the object used by every later
# step), mirroring the Wave 6 exclusion; previously it was written to
# WV5_data_df, which is never read again, so the filter had no effect.
WV5_data <- subset(WV5_data, risk_and_adventure > 0 & sex > 0 & age > 0)
# Decode the Wave 5 country codes into country names.
# The lookup file is read without a header row; its two columns are named
# "code" and "name" here.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")

# match() gives, for each respondent, the row of the lookup table whose code
# equals the respondent's country code (NA when there is no such row).
WV5_data[["country_lab"]] <-
  countrynames[["name"]][match(WV5_data[["country"]], countrynames[["code"]])]

# Number of respondents per decoded country.
table(WV5_data[["country_lab"]])

            Andorra           Argentina           Australia              Brazil            Bulgaria 
               1003                1002                1421                1500                1001 
       Burkina Faso              Canada               Chile               China            Colombia 
               1534                2164                1000                1991                3025 
         Cyprus (G)               Egypt            Ethiopia             Finland              France 
               1050                3051                1500                1014                1001 
            Georgia             Germany               Ghana       Great Britain           Guatemala 
               1500                2064                1534                1041                1000 
          Hong Kong             Hungary               India           Indonesia                Iran 
               1252                1007                2001                2015                2667 
               Iraq               Italy               Japan              Jordan            Malaysia 
               2701                1012                1096                1200                1201 
               Mali              Mexico             Moldova             Morocco         Netherlands 
               1534                1560                1046                1200                1050 
        New Zealand              Norway                Peru              Poland             Romania 
                954                1025                1500                1000                1776 
             Russia              Rwanda            Slovenia        South Africa         South Korea 
               2033                1507                1037                2988                1200 
              Spain              Sweden         Switzerland              Taiwan            Thailand 
               1200                1003                1241                1227                1534 
Trinidad and Tobago              Turkey             Ukraine       United States             Uruguay 
               1002                1346                1000                1249                1000 
           Viet Nam              Zambia 
               1495                1500 
# Display the Wave 5 data (auto-printed as a top-level expression).
WV5_data
NA
NA

# Read Dataset (Wave 6)

# load() restores the objects saved in the .rdata file into the workspace
# under their saved names and returns only a character vector of those names.
# Its return value is therefore not the data and is deliberately not assigned.
load("/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/WV6_Data_R_v20201117.rdata")

# The restored data object is copied to the working name used below.
WV6_data <- WV6_Data_R_v20201117
print(WV6_data)

# rename variables

# Give the Wave 6 survey items readable names (new_name = original column).
# rename() keeps every column in place, so argument order does not matter.
# NOTE(review): education is renamed here but is not among the columns
# selected below; confirm V237 is the intended item if it is needed later.
WV6_data <- rename(
  WV6_data,
  wave = V1,
  country = V2,
  risk_and_adventure = V76,
  education = V237,
  sex = V240,
  age = V242
)


# Keep only the variables of interest, in this column order.
WV6_data <- select(WV6_data, risk_and_adventure, sex, age, country, wave)
WV6_data
NA

# Decode the Wave 6 country codes into country names.
# The lookup file is read without a header row; its two columns are named
# "code" and "name" here.
countrynames <- read.csv(
  "/Users/cristinacandido/Documents/Github/risk_wvs/data/WVS/countrynames.txt",
  header = FALSE,
  as.is = TRUE
)
names(countrynames) <- c("code", "name")

# match() gives, for each respondent, the row of the lookup table whose code
# equals the respondent's country code (NA when there is no such row).
WV6_data[["country_lab"]] <-
  countrynames[["name"]][match(WV6_data[["country"]], countrynames[["code"]])]

# Number of respondents per decoded country.
table(WV6_data[["country_lab"]])

            Algeria           Argentina             Armenia           Australia          Azerbaijan 
               1200                1030                1100                1477                1002 
            Belarus              Brazil               Chile               China            Colombia 
               1535                1486                1000                2300                1512 
         Cyprus (G)             Ecuador               Egypt             Estonia             Georgia 
               1000                1202                1523                1533                1202 
            Germany               Ghana               Haiti           Hong Kong               India 
               2046                1552                1996                1000                4078 
               Iraq               Japan              Jordan          Kazakhstan              Kuwait 
               1200                2443                1200                1500                1303 
         Kyrgyzstan             Lebanon               Libya            Malaysia              Mexico 
               1500                1200                2131                1300                2000 
            Morocco         Netherlands         New Zealand             Nigeria            Pakistan 
               1200                1902                 841                1759                1200 
          Palestine                Peru         Philippines              Poland               Qatar 
               1000                1210                1200                 966                1060 
            Romania              Russia              Rwanda           Singapore            Slovenia 
               1503                2500                1527                1972                1069 
       South Africa         South Korea               Spain              Sweden              Taiwan 
               3531                1200                1189                1206                1238 
           Thailand Trinidad and Tobago             Tunisia              Turkey             Ukraine 
               1200                 999                1205                1605                1500 
      United States             Uruguay          Uzbekistan               Yemen            Zimbabwe 
               2232                1000                1500                1000                1500 
# Display the Wave 6 data (auto-printed as a top-level expression).
WV6_data

# Drop Wave 6 participants with no info about risk taking, sex, or age:
# only rows where all three values are greater than 0 are kept.

WV6_data <- subset(
  WV6_data,
  subset = age > 0 & sex > 0 & risk_and_adventure > 0
)

# Stack the two waves into one data set: Wave 5 rows first, then Wave 6 rows.

data <- rbind(WV5_data, WV6_data)

# Display the combined data.
data

# Number of distinct country labels in the combined data
# (a missing label, if any, counts as one distinct value).

length(unique(data[["country_lab"]]))
[1] 80

# Number of participants = number of rows in the combined data.

dim(data)[1L]
[1] 170195

# Exclusion of participants: keep only rows of the combined data where age,
# sex, and risk taking are all greater than 0.

data <- subset(
  data,
  subset = age > 0 & sex > 0 & risk_and_adventure > 0
)

# Display the filtered data.
data
NA

# Count of participants per sex code (1 = males; 2 = females).

table(data[["sex"]])

    1     2 
75737 81963 

# Create a categorical age variable (ten-year bands).
# Intervals are closed on the left and open on the right, so e.g. age 20 falls
# in "20-29". Every age below 20 is labelled "15-19" and every age of 80 or
# more is labelled "80+". The result is stored as plain character labels.

data$agecat <- as.character(
  cut(
    data$age,
    breaks = c(-Inf, 20, 30, 40, 50, 60, 70, 80, Inf),
    labels = c(
      "15-19", "20-29", "30-39", "40-49",
      "50-59", "60-69", "70-79", "80+"
    ),
    right = FALSE
  )
)

# Recode sex from numeric codes to labels (1 -> "male", 2 -> "female").
# The first replacement turns the column into character, so the second
# comparison matches the code 2 in its character form.

data$sex <- replace(data$sex, data$sex == 1, "male")
data$sex <- replace(data$sex, data$sex == 2, "female")

# Average age of the participants.

mean(data[["age"]])

# Age range: youngest and oldest participant.

range(data[["age"]])

#risk taking Frequency

# ggplot2 provides ggplot(), aes(), the geom_*() layers, labs() and
# theme_minimal(). It is attached here because none of the library() calls
# visible earlier in this script (data.table, tidyr, dplyr) load it, so the
# plotting calls would otherwise fail with "could not find function".
library(ggplot2)

# Histogram of the risk-taking item across all participants.
ggplot(data, aes(x = risk_and_adventure)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  labs(x = "Risk Taking", y = "Frequency", title = "Histogram of Risk Taking") +
  theme_minimal()

#age frequency

# Histogram of participant age (plot title typo "Distributionn" corrected).
# NOTE(review): if age is recorded in whole years, binwidth = 0.5 leaves every
# other bin empty; binwidth = 1 may be the intended setting - confirm.
ggplot(data, aes(x = age)) +
  geom_histogram(binwidth = 0.5, fill = "lightblue", color = "black") +
  labs(x = "Age", y = "Frequency", title = "Histogram of Age Distribution") +
  theme_minimal()

# Age vs risk taking: one box of risk scores per age band.

ggplot(data = data, mapping = aes(agecat, risk_and_adventure)) +
  geom_boxplot() +
  labs(
    x = "Age",
    y = "Risk and Adventure",
    title = "Boxplot of Risk and Adventure by Age"
  ) +
  theme_minimal()

NA
NA

# Sex vs risk taking: one box of risk scores per sex.

ggplot(
  data = data,
  mapping = aes(x = as.factor(sex), y = risk_and_adventure)
) +
  geom_boxplot()

# Display the final data set.
data
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkoZGF0YS50YWJsZSkKbGlicmFyeSh0aWR5cikKYGBgCgojcmVhZCB0aGUgZGF0YSAoV2F2ZSA1KQpgYGB7cn0KIyBEYXRhIG9mIFdhdmUgNQoKCldWNV9kYXRhIDwtIHJlYWRSRFMoIi9Vc2Vycy9jcmlzdGluYWNhbmRpZG8vRG9jdW1lbnRzL0dpdGh1Yi9yaXNrX3d2cy9kYXRhL1dWUy9GMDAwMDc5NDQtV1Y1X0RhdGFfUl92MjAxODA5MTIucmRzIikKCgojIENvbnZlcnQgV1Y1X2RhdGEtb2JqZWN0IGluIGRhdGEuZnJhbWUgCldWNV9kYXRhX2RmIDwtIGFzLmRhdGEuZnJhbWUoV1Y1X2RhdGEpCgojIHNob3cgZmlyc3QgZml2ZSBjb2x1bW5zCmhlYWQoV1Y1X2RhdGFfZGZbLCAxOjVdKQpgYGAKCiMgY2xlYW4gdGhlIGRhdGEgc2V0CmBgYHtyfQpsaWJyYXJ5KGRwbHlyKQoKI3JlbmFtZSB0aGUgdmFyaWFibGVzCldWNV9kYXRhIDwtIFdWNV9kYXRhX2RmICU+JQogIHJlbmFtZShyaXNrX2FuZF9hZHZlbnR1cmUgPSBWODYsIHNleCA9IFYyMzUsIGFnZSA9IFYyMzcsIGNvdW50cnkgPSBWMiwgd2F2ZSA9IFYxKQpXVjVfZGF0YQoKCiNzZWxlY3Qgb25seSB0aGUgdmFyaWFibGVzIG9mIGludGVyZXN0CldWNV9kYXRhIDwtIFdWNV9kYXRhICU+JQogIHNlbGVjdChyaXNrX2FuZF9hZHZlbnR1cmUsIHNleCwgYWdlLCBjb3VudHJ5LCB3YXZlKQpXVjVfZGF0YQpgYGAKCmBgYHtyfQojZXhsY3VzaW9uIG9mIHBhcnRpY2lwYW50cyB3aXRoIG5vIGluZm8gYWJvdXQgcmlzaywgc2V4LCBhZ2UsIGVtcGxveW1lbnQsIG1lcml0YWwgc3RhdHVzIGFuZCBjaGlsZHJlbiAKV1Y1X2RhdGFfZGYgPSBzdWJzZXQoV1Y1X2RhdGEsIHJpc2tfYW5kX2FkdmVudHVyZSA+IDAgJiBzZXggPiAwICYgYWdlID4wKQpgYGAKCmBgYHtyfQojZGVjb2RlIHRoZSBjb3VudHJ5IG5hbWVzIApjb3VudHJ5bmFtZXMgPSByZWFkLmNzdigiL1VzZXJzL2NyaXN0aW5hY2FuZGlkby9Eb2N1bWVudHMvR2l0aHViL3Jpc2tfd3ZzL2RhdGEvV1ZTL2NvdW50cnluYW1lcy50eHQiLCBoZWFkZXI9RkFMU0UsYXMuaXM9VFJVRSkKY29sbmFtZXMoY291bnRyeW5hbWVzKSA9IGMoImNvZGUiLCAibmFtZSIpCldWNV9kYXRhJGNvdW50cnlfbGFiID0gY291bnRyeW5hbWVzJG5hbWUgW21hdGNoKFdWNV9kYXRhJGNvdW50cnksIGNvdW50cnluYW1lcyRjb2RlKV0KdGFibGUoV1Y1X2RhdGEkY291bnRyeV9sYWIpCldWNV9kYXRhCgoKYGBgCgojUmVhZCBEYXRhc2V0IChXYXZlIDYpCmBgYHtyfQpXVjZfZGF0YSA8LSBsb2FkKCIvVXNlcnMvY3Jpc3RpbmFjYW5kaWRvL0RvY3VtZW50cy9HaXRodWIvcmlza193dnMvZGF0YS9XVlMvV1Y2X0RhdGFfUl92MjAyMDExMTcucmRhdGEiKSAKV1Y2X2RhdGEgPC0gV1Y2X0RhdGFfUl92MjAyMDExMTcgCnByaW50KFdWNl9kYXRhKQpgYGAKYApgYHtyfQojcmVuYW1lIHZhcmlhYmxlcwpgYGB7cn0KV1Y2X2RhdGEgPC0gV1Y2X2RhdGEgJT4lCiAgcmVu
YW1lKHdhdmUgPSBWMSwgcmlza19hbmRfYWR2ZW50dXJlID0gVjc2LCBzZXggPSBWMjQwLCBhZ2UgPSBWMjQyLCBlZHVjYXRpb24gPSBWMjM3LCBjb3VudHJ5ID0gVjIpCgoKI3NlbGVjdCBvbmx5IHRoZSB2YXJpYWJsZXMgb2YgaW50ZXJlc3QKV1Y2X2RhdGEgPC0gV1Y2X2RhdGEgJT4lCiAgc2VsZWN0KHJpc2tfYW5kX2FkdmVudHVyZSwgc2V4LCBhZ2UsIGNvdW50cnksIHdhdmUpCldWNl9kYXRhCgpgYGAKCgojZGVjb2RlIGRhcmFzZXQgKFdhdmUgNikKYGBge3J9CmNvdW50cnluYW1lcyA9IHJlYWQuY3N2KCIvVXNlcnMvY3Jpc3RpbmFjYW5kaWRvL0RvY3VtZW50cy9HaXRodWIvcmlza193dnMvZGF0YS9XVlMvY291bnRyeW5hbWVzLnR4dCIsIGhlYWRlcj1GQUxTRSxhcy5pcz1UUlVFKQpjb2xuYW1lcyhjb3VudHJ5bmFtZXMpID0gYygiY29kZSIsICJuYW1lIikKV1Y2X2RhdGEkY291bnRyeV9sYWIgPSBjb3VudHJ5bmFtZXMkbmFtZSBbbWF0Y2goV1Y2X2RhdGEkY291bnRyeSwgY291bnRyeW5hbWVzJGNvZGUpXQp0YWJsZShXVjZfZGF0YSRjb3VudHJ5X2xhYikKV1Y2X2RhdGEKYGBgCgojZXhjbHVkZSBwYXJ0aWNpcGFudHMgd2l0aCBubyBpbmZvIGFib3V0IHJpc2ssIHNleCwgYW5kIGFnZQpgYGB7cn0KV1Y2X2RhdGEgPSBzdWJzZXQoV1Y2X2RhdGEsIHJpc2tfYW5kX2FkdmVudHVyZSA+IDAgJiBzZXggPiAwICYgYWdlID4wKQpgYGAKCiNjb21iaW5lIHRoZSAyIGRhdGFzZXQgKFdhdmUgNiArIFdhdmUgNSkKYGBge3J9CmRhdGEgPSByYmluZChXVjVfZGF0YSwgV1Y2X2RhdGEpCmRhdGEKYGBgCiNudW1iZXIgb2YgY291bnRyaWVzCmBgYHtyfQpsZW5ndGgodW5pcXVlKGRhdGEkY291bnRyeV9sYWIpKQpgYGAKCiNudW1iZXIgb2YgcGFydGljaXBhbnRzCmBgYHtyfQpucm93KGRhdGEpCmBgYAojZXhjbHVzaW9uIG9mIHBhcnRpY2lwYW50cwpgYGB7cn0KZGF0YSA9IHN1YnNldChkYXRhLCByaXNrX2FuZF9hZHZlbnR1cmUgPiAwICYgc2V4ID4gMCAmIGFnZSA+IDApCmRhdGEKCmBgYAojbnVtYmVyIG9mIG1hbGVzIHZzIGZlbWFsZXMgKDEgPSBtYWxlczsgMiA9IGZlbWFsZXMpCmBgYHtyfQp0YWJsZShkYXRhJHNleCkKYGBgCiNjcmVhdGUgYSBjYXRlZ29yaWNhbCBhZ2UgdmFyaWFibGUKYGBge3J9CmRhdGEkYWdlY2F0W2RhdGEkYWdlPDIwXT0iMTUtMTkiCmRhdGEkYWdlY2F0W2RhdGEkYWdlPj0yMCAmIGRhdGEkYWdlIDwzMF0gPSAiMjAtMjkiCmRhdGEkYWdlY2F0W2RhdGEkYWdlPj0zMCAmIGRhdGEkYWdlIDw0MF0gPSAiMzAtMzkiCmRhdGEkYWdlY2F0W2RhdGEkYWdlPj00MCAmIGRhdGEkYWdlIDw1MF0gPSAiNDAtNDkiCmRhdGEkYWdlY2F0W2RhdGEkYWdlPj01MCAmIGRhdGEkYWdlIDw2MF0gPSAiNTAtNTkiCmRhdGEkYWdlY2F0W2RhdGEkYWdlPj02MCAmIGRhdGEkYWdlIDw3MF0gPSAiNjAtNjkiCmRhdGEkYWdlY2F0W2RhdGEkYWdlPj03MCAmIGRhdGEkYWdlIDw4MF0gPSAiNzAtNzkiCmRhdGEkYWdlY2F0W2RhdGEkYWdlPj04
MF0gPSAiODArIgpgYGAKCgojZ2VuZGVyIHZhcmlhYmxlcwpgYGB7cn0KZGF0YSRzZXhbZGF0YSRzZXggPT0gMV0gPC0gIm1hbGUiCmRhdGEkc2V4W2RhdGEkc2V4ID09IDJdIDwtICJmZW1hbGUiCmBgYAoKI2F2ZXJhZ2UgYWdlIG9mIHBhcnRpY2lwYW50cwpgYGB7cn0KbWVhbihkYXRhJGFnZSkKYGBgCgojYWdlIHJhbmdlCmBgYHtyfQpyYW5nZShkYXRhJGFnZSkgCmBgYAojcmlzayB0YWtpbmcgRnJlcXVlbmN5CmBgYHtyfQpnZ3Bsb3QoZGF0YSwgYWVzKHggPSByaXNrX2FuZF9hZHZlbnR1cmUpKSArCiAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSAwLjUsIGZpbGwgPSAibGlnaHRibHVlIiwgY29sb3IgPSAiYmxhY2siKSArCiAgbGFicyh4ID0gIlJpc2sgVGFraW5nIiwgeSA9ICJGcmVxdWVuY3kiLCB0aXRsZSA9ICJIaXN0b2dyYW0gb2YgUmlzayBUYWtpbmciKSArCiAgdGhlbWVfbWluaW1hbCgpCmBgYAojYWdlIGZyZXF1ZW5jeQpgYGB7cn0KZ2dwbG90KGRhdGEsIGFlcyh4ID0gYWdlKSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMC41LCBmaWxsID0gImxpZ2h0Ymx1ZSIsIGNvbG9yID0gImJsYWNrIikgKwogIGxhYnMoeCA9ICJBZ2UiLCB5ID0gIkZyZXF1ZW5jeSIsIHRpdGxlID0gIkhpc3RvZ3JhbSBvZiBBZ2UgRGlzdHJpYnV0aW9ubiIpICsKICB0aGVtZV9taW5pbWFsKCkKYGBgCiNhZ2UgdnMgcmlzayB0YWtpbmcKYGBge3J9CgpnZ3Bsb3QoZGF0YSwgYWVzKHggPSBhZ2VjYXQsIHkgPSByaXNrX2FuZF9hZHZlbnR1cmUpKSArCiAgZ2VvbV9ib3hwbG90KCkgKwogIGxhYnModGl0bGUgPSAiQm94cGxvdCBvZiBSaXNrIGFuZCBBZHZlbnR1cmUgYnkgQWdlIiwKICAgICAgIHggPSAiQWdlIiwKICAgICAgIHkgPSAiUmlzayBhbmQgQWR2ZW50dXJlIikgKwogIHRoZW1lX21pbmltYWwoKQoKCmBgYAojc2V4IHZzIHJpc2sgdGFraW5nCmBgYHtyfQpnZ3Bsb3QoZGF0YSwgYWVzKGFzLmZhY3RvcihzZXgpLCByaXNrX2FuZF9hZHZlbnR1cmUgKSkrCiAgZ2VvbV9ib3hwbG90KCkKCmBgYApgYGB7cn0KZGF0YQpgYGAKCmBgYHtyfQoKYGBgCgoK